Mine Common French Words from Wiktionary

Sites: the five Wiktionary French frequency-list pages (ranks 1-10000) listed in the cell below.


In [1]:
# Each page covers a block of 2000 ranks: 1-2000, 2001-4000, ..., 8001-10000.
url_list = [
    "https://en.wiktionary.org/wiki/Wiktionary:French_frequency_lists/%d-%d"
    % (first_rank, first_rank + 1999)
    for first_rank in range(1, 10000, 2000)
]

In [2]:
from urllib.request import urlopen
from lxml import html
import unicodedata

In [3]:
# Accumulators for the scrape: the list holds the collected words, and the
# set mirrors its contents for membership checks.
french_words, french_words_set = [], set()

In [4]:
# Function to strip accents
def strip_accents(s):
    """Return ``s`` with its combining accent marks removed.

    The string is decomposed with Unicode NFD normalization, after which
    every character whose Unicode category is 'Mn' is discarded. Characters
    that NFD does not decompose are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept_chars = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept_chars)

# Function to process a word
def process_french_word(word):
    """Normalize ``word``: lowercase it, then pass it through strip_accents."""
    lowered = word.lower()
    return strip_accents(lowered)

In [5]:
# Fetch every page in url_list, pull the link text out of the list items
# inside the page's tables, and record each processed word the first time
# it is seen.
for url in url_list:
    # Use the response as a context manager so the HTTP connection is closed
    # as soon as the body has been read, even if read() raises.
    with urlopen(url) as response:
        page_html = response.read()
    tree = html.fromstring(page_html)
    word_list = tree.xpath('.//div/table//tr//li/span/a')
    for w in word_list:
        word = w.text
        # lxml reports an element without leading text as text=None; skip
        # such anchors (and empty text) instead of crashing on .lower().
        if not word:
            continue
        proc_word = process_french_word(word)
        # The set makes the duplicate check cheap; the list is what gets
        # written out later.
        if proc_word not in french_words_set:
            french_words_set.add(proc_word)
            french_words.append(proc_word)

In [6]:
# Write words to a text file, one word per line.
# The with-block closes the file even if a write fails. UTF-8 is requested
# explicitly because accent stripping only removes combining marks: other
# non-ASCII characters (e.g. the ligature in "c\u0153ur") remain in the words
# and may not be representable in the platform's default encoding.
with open("french.txt", 'w', encoding="utf-8") as f_out:
    for word in french_words:
        f_out.write(word + "\n")

In [ ]: